import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
from sklearn.datasets import make_regression
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection
from sklearn.manifold import MDS
from sklearn.manifold import TSNE
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.metrics import mean_squared_error
from yellowbrick.features import Manifold
# Define a small 3x2 data set for a manual PCA walkthrough
data = np.array([[1, 2],
                 [3, 4],
                 [5, 6]])
# Compute the column means
means = np.mean(data, axis=0)
# Center the data
centered = data - means
# Find the covariance matrix:
# diagonal entries are the variances of each component,
# symmetric off-diagonal entries are the covariances between components
covariance = np.cov(centered.T)
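# Sanity check (a minimal sketch): np.cov on the transposed data matches the
# textbook formula centered.T @ centered / (n - 1)
assert np.allclose(covariance, centered.T.dot(centered) / (len(data) - 1))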
# Perform eigenvalue decomposition
eigval, eigvec = np.linalg.eig(covariance)
# The eigenvalues of the covariance matrix are already the component
# variances, so the explained-variance ratio is each eigenvalue over their sum
explained = eigval / eigval.sum()
# Project data
projected = eigvec.T.dot(centered.T)
print('Eigenvalues', eigval)
print('Explained Variance Ratio', explained)
print('Eigenvectors \n', eigvec.T)
print('Projected\n', projected.T)
pcaTest = PCA(2)
pcaResults = pcaTest.fit(data)
print('Explained Variance', pcaResults.explained_variance_)
print('Explained Variance Ratio', pcaResults.explained_variance_ratio_)
print('Components\n', pcaResults.components_)
print('Projected\n', pcaResults.transform(data))
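# Sanity check (a sketch): np.linalg.eig does not order eigenvalues, so sort
# descending before comparing with sklearn; eigenvector signs are arbitrary,
# so compare absolute values of the projections
order = np.argsort(eigval)[::-1]
assert np.allclose(np.abs(projected.T[:, order]),
                   np.abs(pcaResults.transform(data)))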
# Load the 8x8 handwritten digits dataset (1797 samples, 64 pixel features)
digits = load_digits()
X = digits['data']
y = digits['target']
X.shape
# Scatter the first two embedding dimensions, annotating every skip-th point
# with its digit label
def plotDigits(data, y=y, skip=5):
    plt.figure(figsize=(12, 10))
    plt.scatter(data[:, 0], data[:, 1], alpha=0.5, cmap='viridis', c=y)
    for ix, num in enumerate(y):
        if (ix + 1) % skip == 0:
            plt.annotate(str(num), (data[ix, 0], data[ix, 1]))
# Fit PCA with all 64 components to inspect the full variance spectrum
pca = PCA(n_components=X.shape[1])
pca.fit_transform(X)
pca.explained_variance_ratio_
plt.figure(figsize=(10,8))
plt.bar(range(10), pca.explained_variance_ratio_[0:10])
plt.xlabel('Components')
plt.ylabel('Explained Variance Ratio')
# With every component kept, the ratios sum to 1.0
pca.explained_variance_ratio_.sum()
plt.figure(figsize=(10,8))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Components')
plt.ylabel('Cumulative Explained Var')
# Variance captured by the first five components alone
pca.explained_variance_ratio_[0:5].sum()
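# A common follow-up (sketch): find the smallest number of components that
# reaches a target cumulative variance, e.g. 95%
cumvar = np.cumsum(pca.explained_variance_ratio_)
n_95 = int(np.searchsorted(cumvar, 0.95)) + 1
print(f'{n_95} components retain 95% of the variance')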
# Keep all 64 components; plotDigits shows only the first two (highest-variance) axes
pca = PCA(n_components=64)
projected = pca.fit_transform(X)
plotDigits(projected)
# For contrast, plot two later, lower-variance components: the class
# structure visible in the leading components largely disappears
plt.figure(figsize=(12, 10))
plt.scatter(projected[:, -50], projected[:, -49], alpha=0.5,
            cmap='viridis', c=y)
# Truncated SVD (no mean-centering, so it also suits sparse input)
svd = TruncatedSVD(n_components=10,
                   algorithm='randomized',
                   n_iter=5,
                   random_state=1337)
X_train_svd = svd.fit_transform(X)
plotDigits(X_train_svd)
plt.figure(figsize=(10,8))
# TruncatedSVD's ratios are not guaranteed to be in descending order, so sort for display
plt.bar(range(10), sorted(svd.explained_variance_ratio_[0:10], reverse=True))
plt.xlabel('Components')
plt.ylabel('Explained Variance Ratio')
# Interactive widget: compare dense storage (rows * cols) against the storage
# for a rank-k truncated SVD (U, the k singular values, and V.T)
@interact
def spaceSaver(rows='100', cols='100', components='5'):
    rows = int(rows)
    cols = int(cols)
    components = int(components)
    print('Original Space', rows * cols)
    print('SVD Space', rows * components + cols * components + components)
    print('\n')
    print(f'U Shape ({rows}, {components})')
    print(f'S Shape ({components}, {components})')
    print(f'V.T Shape ({components}, {cols})')
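# A minimal sketch of the factorization itself, using np.linalg.svd: keeping
# the top k singular triplets gives the best rank-k approximation of X
U, s, Vt = np.linalg.svd(X, full_matrices=False)
k = 10
X_rank_k = U[:, :k] @ np.diag(s[:k]) @ Vt[:k, :]
print('Stored values:', U[:, :k].size + k + Vt[:k, :].size, 'vs', X.size)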
# Random projection: eps bounds the allowed pairwise-distance distortion
grp = GaussianRandomProjection(n_components=2,
                               eps=.5,
                               random_state=1002)
X_train_grp = grp.fit_transform(X)
plotDigits(X_train_grp)
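# The eps above relates to the Johnson-Lindenstrauss bound; sklearn can report
# the minimum dimensionality that guarantees that distortion (a sketch: 2
# components is far below it, so this 2D projection is purely illustrative)
from sklearn.random_projection import johnson_lindenstrauss_min_dim
print(johnson_lindenstrauss_min_dim(n_samples=X.shape[0], eps=0.5))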
# t-SNE: perplexity sets the effective neighborhood size; early exaggeration
# pushes clusters apart during the first optimization phase
tSNE = TSNE(n_components=2,
            learning_rate=300,
            perplexity=30,
            early_exaggeration=10,
            random_state=1000)
X_train_tsne = tSNE.fit_transform(X)
plotDigits(X_train_tsne)
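# After fitting, t-SNE exposes the final KL divergence it minimized; lower
# values mean the low-dimensional neighborhoods match the originals better
print('t-SNE KL divergence:', tSNE.kl_divergence_)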
# Locally Linear Embedding: reconstruct each point from its 10 nearest neighbors
lle = LocallyLinearEmbedding(n_neighbors=10,
                             n_components=2)
X_train_lle = lle.fit_transform(X)
plotDigits(X_train_lle)
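# LLE exposes the reconstruction error associated with the embedding
print('LLE reconstruction error:', lle.reconstruction_error_)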
# Metric MDS: preserve pairwise distances; n_init restarts keep the best layout
mds = MDS(n_components=2,
          n_init=10,
          max_iter=100)
X_train_mds = mds.fit_transform(X)
plotDigits(X_train_mds)
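# MDS exposes its final stress (sum of squared disparity-distance differences);
# the best of the n_init restarts is the one with the lowest stress
print('MDS stress:', mds.stress_)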
# Manifold algorithms supported by Yellowbrick's Manifold visualizer
v = ['lle', 'ltsa', 'hessian', 'modified', 'isomap', 'mds', 'spectral', 'tsne']
@interact
def yellowbrickDigits(manifold=v):
    fig, ax = plt.subplots(figsize=(12, 10))
    viz = Manifold(manifold=manifold, ax=ax)
    viz.fit_transform(X, y)
# Interactive widget: keep enough components to retain `explained` variance,
# invert the projection, and compare the reconstruction to the original image
@interact
def reconstruct(explained=(0.0, 1.0, 0.05), image=(0, 9)):
    # PCA accepts a float in (0, 1) as a target variance fraction;
    # clamp the slider endpoints to explicit component counts
    if explained >= 1.0: explained = 64
    if explained <= 0.0: explained = 2
    pcaRecon = PCA(explained)
    X_pca = pcaRecon.fit_transform(X)
    print(f'Components {pcaRecon.n_components_}')
    approx = pcaRecon.inverse_transform(X_pca)
    plt.subplot(1, 2, 1)
    plt.imshow(X[image].reshape(8, 8))
    plt.title('Original')
    plt.subplot(1, 2, 2)
    plt.imshow(approx[image].reshape(8, 8))
    plt.title('Reconstructed')
    mse = mean_squared_error(X[image], approx[image])
    print(f'MSE = {mse}')
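# The widget shows one image at a time; a quick sketch of the same trade-off
# over the whole dataset: reconstruction MSE as a function of component count
ks = [2, 4, 8, 16, 32, 64]
mses = []
for k in ks:
    p = PCA(k).fit(X)
    mses.append(mean_squared_error(X, p.inverse_transform(p.transform(X))))
plt.figure(figsize=(10, 8))
plt.plot(ks, mses)
plt.xlabel('Components')
plt.ylabel('Reconstruction MSE')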
# Show the first 40 clean digit images for reference
fig, axes = plt.subplots(4, 10, figsize=(10, 4),
                         subplot_kw={'xticks': [], 'yticks': []},
                         gridspec_kw=dict(hspace=0.1, wspace=0.1))
for i, ax in enumerate(axes.flat):
    ax.imshow(X[i].reshape(8, 8), cmap='binary', interpolation='nearest', clim=(0, 16))
# Add Gaussian noise (standard deviation 4) centered on each pixel value
noise = np.random.normal(X, 4)
fig, axes = plt.subplots(4, 10, figsize=(10, 4),
                         subplot_kw={'xticks': [], 'yticks': []},
                         gridspec_kw=dict(hspace=0.1, wspace=0.1))
for i, ax in enumerate(axes.flat):
    ax.imshow(noise[i].reshape(8, 8), cmap='binary', interpolation='nearest', clim=(0, 16))
# Fit PCA retaining 50% of the variance on the noisy data, then project and
# invert: the dropped low-variance components carry mostly noise
pcaNoise = PCA(0.5)
pcaNoise.fit(noise)
filtered = pcaNoise.transform(noise)
filtered = pcaNoise.inverse_transform(filtered)
fig, axes = plt.subplots(4, 10, figsize=(10, 4),
                         subplot_kw={'xticks': [], 'yticks': []},
                         gridspec_kw=dict(hspace=0.1, wspace=0.1))
for i, ax in enumerate(axes.flat):
    ax.imshow(filtered[i].reshape(8, 8), cmap='binary', interpolation='nearest', clim=(0, 16))
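# A quick numeric check (sketch): the PCA-filtered images should sit closer
# to the clean originals than the noisy ones do
print('MSE noisy vs clean:   ', mean_squared_error(X, noise))
print('MSE filtered vs clean:', mean_squared_error(X, filtered))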